1 Contribution into project

Aleksandra Dąbrowska - preparing the data, association rules

Alicja Gosiewska - creating plots, association rules

Szymon Górka - creating Markov chains, clustering

Zuzanna Kasprowicz - clustering, making the knit

However, each of us contributed to all parts of the project, so this division into tasks does not reflect our real input. Each of us has done the same amount of work.

2 Introduction and preparing dataset

Copernicus Science Centre (Polish: Centrum Nauki Kopernik) is a science museum in Warsaw. It contains over 450 interactive exhibits that enable visitors to single-handedly carry out experiments and discover the laws of science for themselves. The Centre is the largest institution of its type in Poland and one of the most advanced in Europe.

Our task was to analyze and understand data from CNK. The dataset contains information about visitors and their interaction time with exhibits.

We were trying to answer the following questions:

  1. Which exhibit is usually the last and the first one?
  2. Are there some frequent paths/sub-paths?
  3. Are there any association rules?
  4. What is the probability of passing between exhibits?

We used the following libraries:

library(lubridate)
library(plyr)
library(dplyr)
library(ggplot2)
library(arules)
library(arulesViz)
library(cluster)
library(stringdist)
library(gridExtra)
library(reshape2)
library(igraph)
library(network)
library(sna)
library(ndtv)
library(qgraph)

We prepared the dataset as follows:

  1. We chose visitors from January and deleted visitors with the placeholder number “-1”.
  2. For every visitor we calculated the total time spent at each station.
  3. We computed the number of exhibits for each visitor — the path length.
  4. We removed visitors who spent more than 9000 seconds (2.5 h) in CNK or whose path length was shorter than 5.
  5. Finally we obtained 3 data frames: dane_czyste, dane_sciezki and dane_klaster.
# NOTE(review): hardcoded absolute path — this only works on the author's
# machine; a relative path would make the report reproducible elsewhere.
sciezka <-  "C:/Alicja/R/2016_Statystyka_II_Projekt_1/faza3/nowe/proba/raport" 
setwd(sciezka)

# dane_czyste: one row per (visitor, station) interaction, with the first/last
# timestamps at the station, per-visitor totals and first/last-station markers.
load("dane_czyste.rda")
head(dane_czyste)
##   visitor station            Time_max            Time_min Time_station
## 1  795453   cnk07 2012-01-03 10:34:14 2012-01-03 10:32:51      83 secs
## 2  795453   cnk09 2012-01-03 10:37:44 2012-01-03 10:35:55     109 secs
## 3  795453   cnk16 2012-01-03 10:38:40 2012-01-03 10:37:49      51 secs
## 4  795453   cnk23 2012-01-03 10:46:21 2012-01-03 10:46:16       5 secs
## 5  795453   cnk61 2012-01-03 10:53:18 2012-01-03 10:53:00      18 secs
## 6  795453  cnk29a 2012-01-03 11:00:27 2012-01-03 10:54:52     335 secs
##   Time_total Path_length isFirst isLast
## 1       883           10   cnk07   <NA>
## 2       883           10    <NA>   <NA>
## 3       883           10    <NA>   <NA>
## 4       883           10    <NA>   <NA>
## 5       883           10    <NA>   <NA>
## 6       883           10    <NA>   <NA>
# dane_sciezki: one row per visitor; Path is the concatenation of visited
# station codes (with the common "cnk" prefix stripped) in chronological order.
load("dane_sciezki.rda")
head(dane_sciezki)
##   visitor                                       Path
## 1  795453                     070916236129a69572442a
## 2  795455                           46a29a6610216972
## 3  795468                                 0307172112
## 4  795476 67071716121878a26612225694929a58b555742a38
## 5  795477                           262446a563258b55
## 6  795486                        02b0366122356327961
# dane_klaster: per-visitor summary (total time in seconds and path length),
# used later for clustering.
load("dane_klaster.rda")
head(dane_klaster)
##   visitor Time_total Path_length
## 1  795453       883           10
## 2  795455      1458            7
## 3  795468       330            5
## 4  795476      1507           19
## 5  795477      1006            7
## 6  795486       851            9
# Build the three analysis tables from the raw visit logs (smallLogs):
#   dane_czyste  - one row per (visitor, station) with timing info,
#   dane_sciezki - one row per visitor with the concatenated path string,
#   dane_klaster - one row per visitor with totals used for clustering.
load("smallLogs.rda")
dane<-smallLogs

library(lubridate)

# Derive the calendar month of each log entry and keep January only.
dane["month"] <- NA
dane$month <- month(dane$date)

dane<-subset(dane,(dane$month=="1"))

# Data cleaning: drop the placeholder visitor id "-1".
dane<-subset(dane,(dane$visitor!="-1"))


library(plyr)
# First and last timestamps of every (visitor, station) interaction.
Time_min<-aggregate(date~visitor+station,dane, FUN=min)
Time_max<-aggregate(date~visitor+station,dane, FUN=max)
library(dplyr)
Time<-left_join(Time_max,Time_min,by=c("visitor","station"))

# Time spent at the station = last timestamp - first timestamp (a difftime
# in seconds); then sort by visitor and chronologically by last timestamp.
Time$Time_total <- Time[,3] - Time[,4]
Time<-Time[order(Time[,1],Time[,3]),]


# Rows per visitor in Time = number of distinct stations visited (path length).
Path_count<-count(Time,visitor)

# Total time per visitor, joined with the path length.
dane_czas<-aggregate(Time_total~visitor,Time,FUN=sum)
dane_czas<-left_join(dane_czas,Path_count,by="visitor")


dane_czyste<-left_join(Time,dane_czas,by="visitor")
# Rename columns by position: 3..7 = Time_max, Time_min, Time_station,
# Time_total, Path_length (fragile if column order changes upstream).
colnames(dane_czyste)[3]<-"Time_max"
colnames(dane_czyste)[4]<-"Time_min"
colnames(dane_czyste)[5]<-"Time_station"
colnames(dane_czyste)[6]<-"Time_total"
colnames(dane_czyste)[7]<-"Path_length"

# Keep visitors with total time < 9000 s (2.5 h) and path length >= 5.
dane_czyste<-subset(dane_czyste,(dane_czyste$Time_total<9000))
dane_czyste<-subset(dane_czyste,(dane_czyste$Path_length>4))

# Mark each visitor's first station: the row whose Time_min equals the
# visitor-wide minimum gets the station name, every other row gets NA.
Temp<-aggregate(Time_min~visitor,dane_czyste,FUN=min)
colnames(Temp)[2] <- "temp_min"
dane_czyste<-left_join(dane_czyste,Temp,by = "visitor")
dane_czyste$isFirst <- ifelse(dane_czyste$Time_min==dane_czyste$temp_min,as.character(dane_czyste$station),NA)

# Analogously mark each visitor's last station via the maximum Time_max.
Temp<-aggregate(Time_max~visitor,dane_czyste,FUN=max)
colnames(Temp)[2] <- "temp_max"
dane_czyste<-left_join(dane_czyste,Temp,by = "visitor")
dane_czyste$isLast <- ifelse(dane_czyste$Time_max==dane_czyste$temp_max,as.character(dane_czyste$station),NA)
rm(Temp)
# Drop the helper columns temp_min (position 8) and temp_max (position 10).
dane_czyste<-dane_czyste[,-c(8,10)]

# One path string per visitor: station codes concatenated in visit order,
# then the common "cnk" prefix stripped (e.g. "cnk07" -> "07").
dane_sciezki<- aggregate(dane_czyste$station, dane_czyste["visitor"], paste, collapse="")
colnames(dane_sciezki)[2]<-"Path"
dane_sciezki$Path<-gsub('cnk','',dane_sciezki$Path)


# Per-visitor summary (columns 1, 6, 7 = visitor, Time_total, Path_length).
dane_klaster <- distinct(dane_czyste[,c(1,6,7)])

3 Let’s make the dataset more friendly!

In this chapter we will see some basic statistics and interesting information about visitors and exhibits in CNK.

Average time spent in CNK (in seconds)

mean(as.numeric(dane_klaster$Time_total))
## [1] 1329.065

Median time spent in CNK (in seconds)

median(as.numeric(dane_klaster$Time_total))
## [1] 920

Average path length

mean(dane_klaster$Path_length)
## [1] 11.60206

Median path length

median(dane_klaster$Path_length)
## [1] 10

For each station we can see the number of visitors who visited it:

library(ggplot2)
library(dplyr)
# Count the visits recorded at each station, naming the count column directly.
nowa <- dane_czyste %>%
  group_by(station) %>%
  summarise(liczba = n())

# Order the station factor by visit count so the flipped bar chart is sorted.
nowa$station <- factor(nowa$station, levels = nowa$station[order(nowa$liczba)])

ggplot(nowa, aes(x = factor(station), y = liczba)) +
  geom_bar(stat = "identity") +
  coord_flip()

From the above plot we discovered that the most frequently visited station is cnk16 and the least frequently visited one is cnk29a.

Let’s see plots which show, for each station, the number of guests who start and end their visit to CNK at that station.

library(ggplot2)

library(dplyr)
# Tally how many visitors began their visit at each station.
nowa2 <- dane_czyste %>%
  group_by(isFirst) %>%
  summarise(liczba = n())

# The last row is the NA group (rows that are not a first station) — drop it,
# then sort the stations by count for the flipped bar chart.
nowa2 <- nowa2[-nrow(nowa2), ]
nowa2$isFirst <- factor(nowa2$isFirst, levels = nowa2$isFirst[order(nowa2$liczba)])

ggplot(nowa2, aes(x = factor(isFirst), y = liczba)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Frequency of the first station on the path")

# The same tally for the station each visitor ended their visit at.
nowa3 <- dane_czyste %>%
  group_by(isLast) %>%
  summarise(liczba = n())

nowa3 <- nowa3[-nrow(nowa3), ]
nowa3$isLast <- factor(nowa3$isLast, levels = nowa3$isLast[order(nowa3$liczba)])

ggplot(nowa3, aes(x = factor(isLast), y = liczba)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Frequency of the last station on the path")

From the above plots we can see that the most frequent first station was cnk02a and the most frequent last station was cnk63a.

4 Clustering

In this chapter we will discuss methods of clustering and optimal number of clusters.

We started from the simplest method of clustering — the k-means method. We tried to cluster by the total time spent in CNK and the path length of each visitor.

At first we normalized our data, because time and length are on different scales.

library(cluster)
library(stringdist)
library(dplyr)
# Per-visitor summary: columns 1, 6, 7 = visitor, Time_total, Path_length.
dane_klaster <- distinct(dane_czyste[,c(1,6,7)])

# Normalize both features: sqrt damps the right skew, scale() centers to
# mean 0 / sd 1 so time and path length become comparable.
dane_klaster$Time_total_norm <- scale(sqrt(as.numeric(dane_klaster$Time_total)))
dane_klaster$Path_length_norm <- scale(sqrt(dane_klaster$Path_length))

To find the optimal number of clusters we calculated the within-cluster distances for cluster counts from 2 to 10.

library(cluster)
library(stringdist)
library(dplyr)
# Total within-cluster sum of squares for k = 2..10 on the normalized
# features (columns 4-5). nstart = 10 restarts reduce local-optimum noise,
# but no seed is set, so exact values vary between runs.
Kmax <- 10
WC <- sapply(2:Kmax, function(k) {
  grupy <- kmeans(dane_klaster[,c(4,5)], 
                  centers = k, nstart = 10)
  sum(grupy$withinss)
})

Next we calculated the between-cluster distances for cluster counts from 2 to 10.

library(cluster)
library(stringdist)
library(dplyr)
# Between-cluster sum of squares for k = 2..10.
# NOTE(review): kmeans is re-run here, so WB comes from different random
# initializations than WC above; the two curves are not from the same fits.
Bmax <- 10
WB <- sapply(2:Bmax, function(k) {
  grupy <- kmeans(dane_klaster[,c(4,5)], 
                  centers = k, nstart = 10)
  sum(grupy$betweenss)
})

Let’s compare our results:

library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Elbow plots: the within-cluster distances should flatten and the
# between-cluster distances should saturate near the optimal k.
p1 <- ggplot(data.frame(K=factor(2:Kmax), WC), aes(K, WC)) +
  geom_bar(stat="identity")+ylim(0,30000)+ggtitle("Distances within clusters")
p2 <- ggplot(data.frame(K=factor(2:Bmax), WB), aes(K, WB)) +
  geom_bar(stat="identity")+ylim(0,30000)+ggtitle("Distances between clusters")
grid.arrange(p1,p2,ncol=2)

We can conclude that the optimal number of clusters is 4, because for numbers bigger than 4 we observe a slower decrease in the within-cluster distances and a slower increase in the between-cluster distances.

For the optimal number of clusters we plot the partition of the data into clusters.

library(cluster)
library(stringdist)
library(dplyr)
# Final k-means fit with the chosen k = 4; store cluster labels as a factor.
grupy <- kmeans(dane_klaster[,c(4,5)],centers = 4, nstart = 10)
dane_klaster$Group_k <- factor(grupy$cluster)

ggplot(dane_klaster, aes(Time_total_norm, Path_length_norm, color=Group_k)) +  geom_point(size=2) +  theme_bw()+ggtitle("Division into clusters")

Next we wanted to take the composition of the paths into consideration. We analyzed the differences between paths.

To obtain the matrix of distances between paths we used the Damerau-Levenshtein distance. In this metric the distance between two strings is given by counting the minimum number of operations needed to transform one string into the other, where an operation is defined as an insertion, deletion, or substitution of a single character, or a transposition of two adjacent characters.

The Damerau-Levenshtein distance differs from the classical Levenshtein distance by including transpositions among its allowable operations. The classical Levenshtein distance only allows insertion, deletion, and substitution operations.

library(stringdist)
# Pairwise Damerau-Levenshtein distances between all visitor path strings,
# converted to a plain data frame for clara().
path_distances <- stringdistmatrix(dane_sciezki$Path, method = "dl")
distances <- as.data.frame(as.matrix(path_distances))

In this case we used the CLARA function, because CLARA is designed for clustering large datasets.

# Distance matrix precomputed above and cached in distances.rda.
load("distances.rda") 
library(cluster)
# CLARA: k-medoids on subsamples, suited to large datasets.
# NOTE(review): klastry is never plotted below, although the text promises
# a plot of the division into clusters.
klastry <- clara(distances, k=4)

We presented division on clusters on the following plot:

5 Association rules

For a better understanding of the dataset we looked for some common rules which describe the visited stations. Our first step was creating the matrix of transactions, which contains the information about the stations visited by each visitor.

library(arules)
library(arulesViz)
library(ggplot2)
# Build a visitor x station incidence table: columns 2, 1 = station, visitor;
# t(table(...)) puts visitors in rows and stations in columns.
toApriopri <- dane_czyste[,c(2,1)]
toApriopri <-t(table(toApriopri))
# Drop stations that nobody visited.
toApriopri <- toApriopri[, colSums(toApriopri!=0) > 0]
toApriopri <- as.data.frame(toApriopri)
# Logical matrix: TRUE if the visitor interacted with the station at least once.
toApriopri <- toApriopri >= 1 
  
trans <- as(toApriopri, "transactions")

As a result we obtained a sparse matrix:

library(arules)
library(arulesViz)
# Peek at the underlying sparse incidence matrix of the transactions object.
trans@data[1:10,1:50] # check
## 10 x 50 sparse Matrix of class "ngCMatrix"
##                                                                          
##  [1,] . . . . . . . . . . . . | . . . . . | | . . . . . | . . . | . . . .
##  [2,] . . . . . | . . . . . . . . . . . . . . . . . . | | . . . . . . . .
##  [3,] . . | . . | . . . . . . . . . . . . | | . . | . . . | . | . . . | .
##  [4,] . . . . . . | . . | | | . . . . . . | . . . . . | . . . | . . | | |
##  [5,] . . . . . . . . . . . . . . | . . . . . . . . . . . . . . . . . . .
##  [6,] | . | | . . . . . . . . . . . . . . | . . . . . . | . . . . . | . .
##  [7,] | . . . . . . | . . | . . . . . . . | . . . . . . . . . | . | | | .
##  [8,] . . . . . . . . . . . . . . . . . . . . . . . . . | . . . . . . . .
##  [9,] . | . . . . | . | . | . . . . . . . . | | . | . . | . . | | . | . |
## [10,] . . . . . . . . . . | . . . . . . . | . . . . . | . . . . . . | | .
##                                      
##  [1,] . . . | | . . . . . | | . . | |
##  [2,] . . . . . | . . . . . . . . . .
##  [3,] . . . | . . . . . | . . | | | .
##  [4,] | . . | | | . . . . . . . . | .
##  [5,] | . . . | | . . . . . . . . | .
##  [6,] . . . . | | | | . . | . | | | .
##  [7,] | . . | | | . . . . | . | | | .
##  [8,] . . . . . . . . . . . . . . . .
##  [9,] . . . | . | . . . . | . . . . .
## [10,] | . . | . . | . . | | . . | . .
image(head(trans,300)) #rzadka macierz

We were looking for rules.

library(arules)
library(arulesViz)
# Mine association rules with minimum support 2%; confidence keeps the
# apriori default of 0.8 (visible in the parameter printout below).
rules <- apriori(trans, parameter = list(support = .02)) 
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.8    0.1    1 none FALSE            TRUE    0.02      1     10
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 294 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[59 item(s), 14731 transaction(s)] done [0.00s].
## sorting and recoding items ... [59 item(s)] done [0.02s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.15s].
## writing ... [93 rule(s)] done [0.00s].
## creating S4 object  ... done [0.02s].
# Rules whose right-hand side is cnk12 or cnk16 — i.e. everything except the
# {...,cnk19b} => {cnk19a} family discussed below.
smaller_rules<-subset(rules,subset=rhs %in% c("cnk12","cnk16"))
rules
## set of 93 rules
library(arules)
library(arulesViz)
library(ggplot2)
# Support of the 20 most frequent stations across all transactions.
itemFrequencyPlot(trans, topN =20)

reguly <- inspect(rules)
##    lhs                                rhs      support    confidence
## 1  {cnk19b}                        => {cnk19a} 0.09395153 0.9808646 
## 2  {cnk06,cnk19b}                  => {cnk19a} 0.02321635 0.9913043 
## 3  {cnk19b,cnk23}                  => {cnk19a} 0.02009368 0.9833887 
## 4  {cnk19b,cnk61}                  => {cnk19a} 0.02138348 0.9843750 
## 5  {cnk19b,cnk67}                  => {cnk19a} 0.02097617 0.9840764 
## 6  {cnk19b,cnk39}                  => {cnk19a} 0.02090829 0.9777778 
## 7  {cnk11,cnk19b}                  => {cnk19a} 0.02688209 0.9729730 
## 8  {cnk02b,cnk19b}                 => {cnk19a} 0.02599959 0.9820513 
## 9  {cnk19b,cnk21}                  => {cnk19a} 0.02138348 0.9843750 
## 10 {cnk19b,cnk69}                  => {cnk19a} 0.02179078 0.9876923 
## 11 {cnk19b,cnk20}                  => {cnk19a} 0.02389519 0.9643836 
## 12 {cnk07,cnk19b}                  => {cnk19a} 0.02457403 0.9863760 
## 13 {cnk02a,cnk19b}                 => {cnk19a} 0.03197339 0.9731405 
## 14 {cnk05,cnk19b}                  => {cnk19a} 0.02701785 0.9827160 
## 15 {cnk17,cnk19b}                  => {cnk19a} 0.02661055 0.9775561 
## 16 {cnk19b,cnk66}                  => {cnk19a} 0.02538864 0.9739583 
## 17 {cnk10,cnk19b}                  => {cnk19a} 0.02620324 0.9846939 
## 18 {cnk09,cnk19b}                  => {cnk19a} 0.02681420 0.9729064 
## 19 {cnk19b,cnk78a}                 => {cnk19a} 0.02986898 0.9843400 
## 20 {cnk18,cnk19b}                  => {cnk19a} 0.02919014 0.9862385 
## 21 {cnk12,cnk19b}                  => {cnk19a} 0.03149820 0.9727463 
## 22 {cnk16,cnk19b}                  => {cnk19a} 0.04161293 0.9745628 
## 23 {cnk16,cnk20,cnk21,cnk67}       => {cnk12}  0.02043310 0.8005319 
## 24 {cnk05,cnk16,cnk21,cnk67}       => {cnk12}  0.02097617 0.8005181 
## 25 {cnk09,cnk21,cnk66,cnk67}       => {cnk12}  0.02016156 0.8250000 
## 26 {cnk16,cnk21,cnk66,cnk67}       => {cnk12}  0.02199443 0.8000000 
## 27 {cnk09,cnk16,cnk21,cnk67}       => {cnk12}  0.02240174 0.8168317 
## 28 {cnk09,cnk20,cnk66,cnk67}       => {cnk12}  0.02056887 0.8681948 
## 29 {cnk09,cnk16,cnk20,cnk67}       => {cnk12}  0.02213020 0.8190955 
## 30 {cnk09,cnk16,cnk20,cnk72}       => {cnk12}  0.02016156 0.8005391 
## 31 {cnk11,cnk20,cnk21,cnk66}       => {cnk12}  0.02002580 0.8060109 
## 32 {cnk09,cnk11,cnk20,cnk21}       => {cnk12}  0.02084040 0.8143236 
## 33 {cnk11,cnk16,cnk20,cnk21}       => {cnk12}  0.02308058 0.8018868 
## 34 {cnk07,cnk11,cnk16,cnk21}       => {cnk12}  0.02199443 0.8000000 
## 35 {cnk05,cnk11,cnk16,cnk21}       => {cnk12}  0.02090829 0.8083990 
## 36 {cnk09,cnk11,cnk21,cnk66}       => {cnk12}  0.02090829 0.8062827 
## 37 {cnk09,cnk11,cnk20,cnk66}       => {cnk12}  0.02117982 0.8432432 
## 38 {cnk07,cnk11,cnk12,cnk66}       => {cnk16}  0.02301269 0.8149038 
## 39 {cnk11,cnk12,cnk17,cnk66}       => {cnk16}  0.02301269 0.8033175 
## 40 {cnk05,cnk07,cnk20,cnk21}       => {cnk12}  0.02043310 0.8246575 
## 41 {cnk07,cnk17,cnk20,cnk21}       => {cnk12}  0.02226597 0.8220551 
## 42 {cnk07,cnk20,cnk21,cnk66}       => {cnk12}  0.02206232 0.8145363 
## 43 {cnk07,cnk10,cnk20,cnk21}       => {cnk12}  0.02179078 0.8065327 
## 44 {cnk07,cnk09,cnk20,cnk21}       => {cnk12}  0.02375942 0.8454106 
## 45 {cnk07,cnk16,cnk20,cnk21}       => {cnk12}  0.02599959 0.8080169 
## 46 {cnk05,cnk17,cnk20,cnk21}       => {cnk12}  0.02117982 0.8342246 
## 47 {cnk05,cnk20,cnk21,cnk66}       => {cnk12}  0.02240174 0.8250000 
## 48 {cnk05,cnk10,cnk20,cnk21}       => {cnk12}  0.02280904 0.8296296 
## 49 {cnk05,cnk09,cnk20,cnk21}       => {cnk12}  0.02240174 0.8418367 
## 50 {cnk05,cnk16,cnk20,cnk21}       => {cnk12}  0.02538864 0.8077754 
## 51 {cnk17,cnk20,cnk21,cnk66}       => {cnk12}  0.02457403 0.8302752 
## 52 {cnk10,cnk17,cnk20,cnk21}       => {cnk12}  0.02280904 0.8215159 
## 53 {cnk09,cnk17,cnk20,cnk21}       => {cnk12}  0.02491345 0.8475751 
## 54 {cnk16,cnk17,cnk20,cnk21}       => {cnk12}  0.02735727 0.8108652 
## 55 {cnk10,cnk20,cnk21,cnk66}       => {cnk12}  0.02335211 0.8000000 
## 56 {cnk09,cnk20,cnk21,cnk66}       => {cnk12}  0.02661055 0.8448276 
## 57 {cnk20,cnk21,cnk66,cnk78a}      => {cnk12}  0.02029733 0.8081081 
## 58 {cnk09,cnk10,cnk20,cnk21}       => {cnk12}  0.02457403 0.8246014 
## 59 {cnk09,cnk20,cnk21,cnk78a}      => {cnk12}  0.02043310 0.8224044 
## 60 {cnk09,cnk18,cnk20,cnk21}       => {cnk12}  0.02056887 0.8015873 
## 61 {cnk09,cnk16,cnk20,cnk21}       => {cnk12}  0.02973322 0.8081181 
## 62 {cnk05,cnk07,cnk17,cnk21}       => {cnk12}  0.02029733 0.8037634 
## 63 {cnk05,cnk07,cnk10,cnk21}       => {cnk12}  0.02104406 0.8031088 
## 64 {cnk05,cnk07,cnk09,cnk21}       => {cnk12}  0.02165501 0.8035264 
## 65 {cnk07,cnk17,cnk21,cnk66}       => {cnk12}  0.02260539 0.8024096 
## 66 {cnk07,cnk09,cnk21,cnk78a}      => {cnk12}  0.02009368 0.8131868 
## 67 {cnk05,cnk17,cnk21,cnk66}       => {cnk12}  0.02172290 0.8080808 
## 68 {cnk05,cnk09,cnk17,cnk21}       => {cnk12}  0.02233385 0.8004866 
## 69 {cnk05,cnk09,cnk21,cnk66}       => {cnk12}  0.02369153 0.8097448 
## 70 {cnk05,cnk09,cnk10,cnk21}       => {cnk12}  0.02389519 0.8018223 
## 71 {cnk09,cnk21,cnk66,cnk78a}      => {cnk12}  0.02104406 0.8010336 
## 72 {cnk05,cnk07,cnk20,cnk66}       => {cnk12}  0.02104406 0.8072917 
## 73 {cnk07,cnk17,cnk20,cnk66}       => {cnk12}  0.02396307 0.8286385 
## 74 {cnk07,cnk09,cnk17,cnk20}       => {cnk12}  0.02321635 0.8028169 
## 75 {cnk07,cnk09,cnk20,cnk66}       => {cnk12}  0.02409884 0.8352941 
## 76 {cnk07,cnk16,cnk20,cnk66}       => {cnk12}  0.02735727 0.8027888 
## 77 {cnk07,cnk09,cnk16,cnk20}       => {cnk12}  0.02776458 0.8019608 
## 78 {cnk05,cnk17,cnk20,cnk66}       => {cnk12}  0.02274116 0.8271605 
## 79 {cnk05,cnk10,cnk20,cnk66}       => {cnk12}  0.02437038 0.8013393 
## 80 {cnk05,cnk09,cnk20,cnk66}       => {cnk12}  0.02403096 0.8329412 
## 81 {cnk10,cnk17,cnk20,cnk66}       => {cnk12}  0.02470980 0.8070953 
## 82 {cnk09,cnk17,cnk20,cnk66}       => {cnk12}  0.02633901 0.8416486 
## 83 {cnk17,cnk20,cnk66,cnk78a}      => {cnk12}  0.02063675 0.8328767 
## 84 {cnk09,cnk10,cnk20,cnk66}       => {cnk12}  0.02559229 0.8160173 
## 85 {cnk09,cnk20,cnk66,cnk78a}      => {cnk12}  0.02280904 0.8421053 
## 86 {cnk09,cnk18,cnk20,cnk66}       => {cnk12}  0.02192655 0.8136020 
## 87 {cnk09,cnk16,cnk20,cnk66}       => {cnk12}  0.03136243 0.8105263 
## 88 {cnk16,cnk20,cnk66,cnk78a}      => {cnk12}  0.02593171 0.8042105 
## 89 {cnk09,cnk16,cnk20,cnk21,cnk66} => {cnk12}  0.02090829 0.8555556 
## 90 {cnk09,cnk16,cnk17,cnk21,cnk66} => {cnk12}  0.02043310 0.8179348 
## 91 {cnk09,cnk16,cnk17,cnk20,cnk66} => {cnk12}  0.02104406 0.8587258 
## 92 {cnk09,cnk10,cnk12,cnk20,cnk66} => {cnk16}  0.02056887 0.8037135 
## 93 {cnk09,cnk10,cnk16,cnk20,cnk66} => {cnk12}  0.02056887 0.8416667 
##    lift    
## 1  7.636954
## 2  7.718237
## 3  7.656606
## 4  7.664285
## 5  7.661961
## 6  7.612920
## 7  7.575510
## 8  7.646193
## 9  7.664285
## 10 7.690114
## 11 7.508633
## 12 7.679865
## 13 7.576814
## 14 7.651369
## 15 7.611194
## 16 7.583182
## 17 7.666768
## 18 7.574992
## 19 7.664013
## 20 7.678795
## 21 7.573745
## 22 7.587888
## 23 2.339345
## 24 2.339304
## 25 2.410846
## 26 2.337790
## 27 2.386976
## 28 2.537072
## 29 2.393592
## 30 2.339365
## 31 2.355355
## 32 2.379647
## 33 2.343304
## 34 2.337790
## 35 2.362334
## 36 2.356150
## 37 2.464157
## 38 2.081920
## 39 2.052319
## 40 2.409845
## 41 2.402240
## 42 2.380269
## 43 2.356880
## 44 2.470491
## 45 2.361217
## 46 2.437803
## 47 2.410846
## 48 2.424375
## 49 2.460047
## 50 2.360512
## 51 2.426262
## 52 2.400665
## 53 2.476816
## 54 2.369541
## 55 2.337790
## 56 2.468787
## 57 2.361484
## 58 2.409681
## 59 2.403261
## 60 2.342429
## 61 2.361513
## 62 2.348788
## 63 2.346875
## 64 2.348095
## 65 2.344832
## 66 2.376325
## 67 2.361404
## 68 2.339212
## 69 2.366267
## 70 2.343115
## 71 2.340811
## 72 2.359098
## 73 2.421479
## 74 2.346022
## 75 2.440928
## 76 2.345940
## 77 2.343520
## 78 2.417160
## 79 2.341704
## 80 2.434052
## 81 2.358524
## 82 2.459497
## 83 2.433864
## 84 2.384597
## 85 2.460832
## 86 2.377538
## 87 2.368551
## 88 2.350094
## 89 2.500137
## 90 2.390200
## 91 2.509401
## 92 2.053331
## 93 2.459550
# Label each rule with its row number 1..nrow(reguly) for the scatter plot.
# (seq_len(nrow(...)) is the idiomatic form; the original row(reguly)[,1]
# built a full row-index matrix only to take its first column.)
reguly$n <- seq_len(nrow(reguly))
ggplot(reguly, aes(x=support, y=confidence, label=n, color=lift))+geom_point(size=3) +geom_text(aes(label=n),hjust=2, vjust=2)+ggtitle("Scatter plot for 93 rules")

As we can see, rule 1 has large support and confidence, but we did not find it interesting, because on the Right Hand Side it contains only the entrance of station cnk19 and on the Left Hand Side it contains the exit of station cnk19. The next 21 rules are also not interesting, because they look like rule 1 with one more station added. Taking this into consideration, we dropped the first 22 rules.

library(ggplot2)
library(arules)
# The same scatter plot without the 22 trivial cnk19a/cnk19b rules.
ggplot(reguly[-c(1:22),], aes(x=support, y=confidence, label=n, color=lift))+geom_point(size=3) +geom_text(aes(label=n),hjust=1.5, vjust=1.5)+ggtitle("Scatter plot for 71 rules")

We found rules 56 and 82 interesting.

library(arules)
# The two rules singled out for discussion on the CNK map below.
reguly[c(56,82),]
##                          lhs        rhs    support confidence     lift  n
## 56 {cnk09,cnk20,cnk21,cnk66} => {cnk12} 0.02661055  0.8448276 2.468787 56
## 82 {cnk09,cnk17,cnk20,cnk66} => {cnk12} 0.02633901  0.8416486 2.459497 82

To understand these rules better, we presented them on the map of CNK. First, let’s look at the full map:

Map of the CNK

Map of the CNK

Now, we zoom in our map to show rules number 56 and 82.

zoomed map

zoomed map

Next we used library arulesViz to present our rules.

1.Graph plot: used to visualize association rules with vertices and edges, where vertices typically represent items or item-sets and edges indicate relationships in rules. The first graph shows all 93 rules and the second one shows the remaining 71 rules.

library(arules)

# Recompute the subset of rules with rhs cnk12/cnk16 (same as the
# smaller_rules built right after mining).
smaller_rules<-subset(rules,subset=rhs %in% c("cnk12","cnk16")) # truncated rule set, rules 23-93

# Graph visualisation: all 93 rules first, then the 71 non-trivial ones.
plot(head(sort(rules, by="lift"), 93),  method="graph", control=list(cex=.7)) 

plot(head(sort(smaller_rules, by="lift"), 71),  method="graph", control=list(cex=.7))

We see that the rule {cnk19b} => {cnk19a} is the most common one. Our rules split into two groups. On the second graph, without the first 22 rules, we have only one group with longer rules, but they have a smaller lift than the first 22 rules.

2.Grouped Matrix: similar to the matrix-based plot. Here rules are grouped and presented as aggregates in the matrix.

library(ggplot2)
library(arules)

# Grouped-matrix view of all 93 rules (circle size = support, colour = lift).
plot(head(sort(rules, by="lift"), 93),  method="grouped", control=list(cex=.7))
## Available parameter (with default values):
## main  =  Grouped matrix for 93 rules
## k     =  20
## aggr.fun  =  function (x, na.rm = FALSE)  UseMethod("median")
## col   =  c("#D33F6A", "#D34269", "#D44468", "#D54667", "#D54866", "#D64A65", "#D74C63", "#D74E62", "#D85061", "#D95260", "#D9545F", "#DA565E", "#DB585D", "#DB5A5B", "#DC5C5A", "#DC5E59", "#DD6058", "#DD6256", "#DE6355", "#DF6554", "#DF6753", "#E06951", "#E06B50", "#E16D4F", "#E16E4D", "#E2704C", "#E2724B", "#E27449", "#E37548", "#E37747", "#E47945", "#E47B44", "#E47C42", "#E57E41", "#E58040", "#E6823E", "#E6833D", "#E6853B", "#E6873A", "#E78939", "#E78A37", "#E78C36", "#E88E35", "#E88F33", "#E89132", "#E89331",  "#E89430", "#E9962E", "#E9982D", "#E99A2C", "#E99B2B", "#E99D2B", "#E99F2A", "#E9A029", "#E9A229", "#EAA428", "#EAA528", "#EAA728", "#EAA928", "#EAAA28", "#EAAC28", "#EAAE29", "#EAAF29", "#EAB12A", "#EAB32B", "#E9B42C", "#E9B62D", "#E9B82F", "#E9B930", "#E9BB32", "#E9BC33", "#E9BE35", "#E9C037", "#E8C139", "#E8C33B", "#E8C53D", "#E8C640", "#E8C842", "#E7C944", "#E7CB47", "#E7CD4A", "#E7CE4C", "#E6D04F", "#E6D152", "#E6D355", "#E5D458", "#E5D65B", "#E5D85E", "#E5D961", "#E4DB64", "#E4DC68", "#E4DE6C",  "#E3DF6F", "#E3E173", "#E3E278", "#E2E37D", "#E2E582", "#E2E688", "#E2E791", "#E2E6BD")
## reverse   =  TRUE
## xlab  =  NULL
## ylab  =  NULL
## legend    =  size: support  color: lift
## panel.function    =  function (row, size, shading, spacing)  {     size[size == 0] <- NA     shading[is.na(shading)] <- 1     grid.circle(x = c(1:length(size)), y = row, r = size/2 * (1 - spacing), default.units = "native", gp = gpar(fill = shading, alpha = 0.9)) }
## spacing   =  -1
## newpage   =  TRUE
## gp_labels     =  list(cex = 0.8)
## gp_panels     =  list()
## interactive   =  FALSE
## max.shading   =  NA
## verbose   =  FALSE

plot(head(sort(smaller_rules, by="lift"), 71),  method="grouped", control=list(cex=.7))
## Available parameter (with default values):
## main  =  Grouped matrix for 71 rules
## k     =  20
## aggr.fun  =  function (x, na.rm = FALSE)  UseMethod("median")
## col   =  c("#D33F6A", "#D34269", "#D44468", "#D54667", "#D54866", "#D64A65", "#D74C63", "#D74E62", "#D85061", "#D95260", "#D9545F", "#DA565E", "#DB585D", "#DB5A5B", "#DC5C5A", "#DC5E59", "#DD6058", "#DD6256", "#DE6355", "#DF6554", "#DF6753", "#E06951", "#E06B50", "#E16D4F", "#E16E4D", "#E2704C", "#E2724B", "#E27449", "#E37548", "#E37747", "#E47945", "#E47B44", "#E47C42", "#E57E41", "#E58040", "#E6823E", "#E6833D", "#E6853B", "#E6873A", "#E78939", "#E78A37", "#E78C36", "#E88E35", "#E88F33", "#E89132", "#E89331",  "#E89430", "#E9962E", "#E9982D", "#E99A2C", "#E99B2B", "#E99D2B", "#E99F2A", "#E9A029", "#E9A229", "#EAA428", "#EAA528", "#EAA728", "#EAA928", "#EAAA28", "#EAAC28", "#EAAE29", "#EAAF29", "#EAB12A", "#EAB32B", "#E9B42C", "#E9B62D", "#E9B82F", "#E9B930", "#E9BB32", "#E9BC33", "#E9BE35", "#E9C037", "#E8C139", "#E8C33B", "#E8C53D", "#E8C640", "#E8C842", "#E7C944", "#E7CB47", "#E7CD4A", "#E7CE4C", "#E6D04F", "#E6D152", "#E6D355", "#E5D458", "#E5D65B", "#E5D85E", "#E5D961", "#E4DB64", "#E4DC68", "#E4DE6C",  "#E3DF6F", "#E3E173", "#E3E278", "#E2E37D", "#E2E582", "#E2E688", "#E2E791", "#E2E6BD")
## reverse   =  TRUE
## xlab  =  NULL
## ylab  =  NULL
## legend    =  size: support  color: lift
## panel.function    =  function (row, size, shading, spacing)  {     size[size == 0] <- NA     shading[is.na(shading)] <- 1     grid.circle(x = c(1:length(size)), y = row, r = size/2 * (1 - spacing), default.units = "native", gp = gpar(fill = shading, alpha = 0.9)) }
## spacing   =  -1
## newpage   =  TRUE
## gp_labels     =  list(cex = 0.8)
## gp_panels     =  list()
## interactive   =  FALSE
## max.shading   =  NA
## verbose   =  FALSE

6 Markov Chain

At the end we wanted to present a special object — a Markov chain.

Initially we prepared the data in order to get the number of transitions between each pair of exhibits.

# tab: long-format transition counts — Freq is the number of direct passes
# from station `from` to station `to`.
load("tab.rda")
colnames(tab)[1] <- "from"
colnames(tab)[2] <- "to"
head(tab)
##     from     to Freq
## 1 cnk02a cnk02a    0
## 2 cnk02b cnk02a  519
## 3  cnk03 cnk02a   32
## 4  cnk05 cnk02a  127
## 5  cnk06 cnk02a  161
## 6  cnk07 cnk02a   54

Then we wanted to show all the stations on one graph, but it doesn’t look good — there are too many transitions and arrows, which makes the graph unintelligible. Let’s see it:

library(reshape2)
library(igraph)
library(network)
library(sna)
library(qgraph)
library(ndtv)
library(dplyr)
razem <- tab
links <- tab

# NOTE(review): `razem` has columns from/to/Freq, so razem$station is NULL
# and t1/nodes end up as an empty data frame; neither `links` nor `nodes`
# is used below — this looks like leftover code from an earlier
# igraph/network attempt and could be removed.
t1 <- razem[!duplicated(razem$station),]
nodes <- t1

colnames(tab)[1] <- "from"
colnames(tab)[2] <- "to"
# Row-normalize the counts into transition probabilities: suma holds the
# total outgoing passes per station; after the join, column 3 (Freq.x) is
# renamed "prob" and divided by the per-station total Freq.y.
suma<- aggregate(Freq~from,tab,sum)
tab <- left_join(tab,suma, by="from")
colnames(tab)[3]<- "prob"
tab$prob <- tab$prob/tab$Freq.y

# Full transition graph over all stations with edge probabilities —
# unreadable, as noted in the text.
qgraph(tab[,-4], edge.labels = TRUE)

That is why we decided to present only the graph for the two selected rules:

library(qgraph)

# Stations appearing in rules 56 and 82.
rule_stations <- c("cnk09", "cnk12", "cnk17", "cnk20", "cnk21", "cnk66")

# Keep only transitions where both endpoints belong to the rule stations.
tab_reguly <- subset(tab, from %in% rule_stations & to %in% rule_stations)

# Drop the Freq.y column (position 4) and remove self-loops before plotting.
tab_reguly <- subset(tab_reguly[, -4], from != to)

qgraph(tab_reguly, edge.labels = TRUE)

7 Bibliography

http://shakthydoss.com/mining-associations-apriori-using-r-part-2/

http://stackoverflow.com/

https://www.gitbook.com/book/pbiecek/przewodnik/details

https://en.wikipedia.org/wiki/Copernicus_Science_Centre